{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('ms-en/translated-trainset-parliament.json') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "rejected = ['PERTANYAAN-PERTANYAAN JAWAB LISAN', 'PENGGAL KEEMPAT', 'PUSAT JAGAAN BERDAFTAR',\n",
    "           'BILANGAN PUSAT JAGAAN', 'pewan']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50460\n"
     ]
    }
   ],
   "source": [
    "selected, reject = [], []\n",
    "for row in data:\n",
    "    if any([r.lower() in row[0].lower() for r in rejected]):\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    s = row[0]\n",
    "    if (sum(c.isdigit() for c in s) / len(s)) > 0.15:\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    if sum(c.isalpha() for c in s) == 0:\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    selected.append(row)\n",
    "    \n",
    "print(len(selected))\n",
    "x_parliament, y_parliament = list(zip(*selected))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "with open('ms-en/ubuntu-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "X, Y = list(zip(*data))\n",
    "X = list(X)\n",
    "Y = list(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ms-en/qed-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "x, y = list(zip(*data))\n",
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ms-en/tanzil-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "x, y = list(zip(*data))\n",
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ms-en/translated-3200000.json',\n",
       " 'ms-en/translated-700000.json',\n",
       " 'ms-en/translated-2100000.json',\n",
       " 'ms-en/translated-3300000.json',\n",
       " 'ms-en/translated-2300000.json',\n",
       " 'ms-en/translated-2000000.json',\n",
       " 'ms-en/translated-600000.json',\n",
       " 'ms-en/translated-900000.json',\n",
       " 'ms-en/translated-1000000.json',\n",
       " 'ms-en/translated-1100000.json',\n",
       " 'ms-en/translated-1900000.json',\n",
       " 'ms-en/translated-500000.json',\n",
       " 'ms-en/translated-1400000.json',\n",
       " 'ms-en/translated-1500000.json',\n",
       " 'ms-en/translated-2600000.json',\n",
       " 'ms-en/translated-200000.json',\n",
       " 'ms-en/translated-2900000.json',\n",
       " 'ms-en/translated-3400000.json',\n",
       " 'ms-en/translated-3500000.json',\n",
       " 'ms-en/translated-2800000.json',\n",
       " 'ms-en/translated-300000.json',\n",
       " 'ms-en/translated-2500000.json',\n",
       " 'ms-en/translated-3100000.json',\n",
       " 'ms-en/translated-1300000.json',\n",
       " 'ms-en/translated-2400000.json',\n",
       " 'ms-en/translated-100000.json',\n",
       " 'ms-en/translated-1600000.json',\n",
       " 'ms-en/translated-2700000.json',\n",
       " 'ms-en/translated-0.json',\n",
       " 'ms-en/translated-800000.json',\n",
       " 'ms-en/translated-1800000.json',\n",
       " 'ms-en/translated-2200000.json',\n",
       " 'ms-en/translated-1200000.json',\n",
       " 'ms-en/translated-1700000.json',\n",
       " 'ms-en/translated-3000000.json',\n",
       " 'ms-en/translated-400000.json']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "translated = glob('ms-en/translated*0.json')\n",
    "translated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in translated:\n",
    "    with open(file) as fopen:\n",
    "        data = json.load(fopen)\n",
    "\n",
    "    x, y = list(zip(*data))\n",
    "    X.extend(x)\n",
    "    Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = unidecode(string).replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string\n",
    "\n",
    "def check(string):\n",
    "    string = re.sub('[^A-Za-z\\- ]+', ' ', string)\n",
    "    string = re.sub(r'[ ]+', ' ', string.lower()).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ms-en/informations-100000.json.translate 1.982215263559356\n",
      "ms-en/dialogs.translate 2.4126386389843635\n",
      "ms-en/informations-200000.json.translate 1.6322194321035348\n",
      "ms-en/informations-0.json.translate 2.2543461041397426\n",
      "ms-en/convai2-100000.json.translate 1.8807818751752086\n",
      "ms-en/convai2-0.json.translate 2.149234888669429\n"
     ]
    }
   ],
   "source": [
    "translated = glob('ms-en/*.translate')\n",
    "\n",
    "x, y = [], []\n",
    "\n",
    "rejected = ['lexus', 'little', 'lizards', 'lizard']\n",
    "\n",
    "for t in translated:\n",
    "    \n",
    "    if 'rephrase.json' in t:\n",
    "        continue\n",
    "        \n",
    "    with open(t) as fopen:\n",
    "        data = json.load(fopen)\n",
    "        \n",
    "    count = 0\n",
    "    for no, row in enumerate(data):\n",
    "        splitted = row[0]['text'].split('<>')\n",
    "        splitted_bm = row[1].split('<>')\n",
    "        if len(splitted) != len(splitted_bm):\n",
    "            count += 1\n",
    "            continue\n",
    "        \n",
    "        for k in range(len(splitted)):\n",
    "            s = check(splitted[k])\n",
    "            if any([r in s for r in rejected]):\n",
    "                continue\n",
    "            y.append(splitted[k])\n",
    "            x.append(splitted_bm[k])\n",
    "        \n",
    "    print(t, count / len(data) * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(' saya suka masakan Itali. ', ' i like italian cusine. '),\n",
       " (' Walaupun negara yang dikenali sebagai Itali itu tidak bersatu hingga abad ke-19, masakan dapat memperoleh akar yang dapat ditelusuri sejak abad ke-4 SM.',\n",
       "  ' Although the country known as Italy did not unite until the 19th century, the cuisine can claim traceable roots as far back as the 4th century BCE.'),\n",
       " ('Muzik rock ', 'Rock music '),\n",
       " (' muzik rock adalah kegemaran saya. ', ' rock music is my favorite. '),\n",
       " (' Muzik rock juga menarik perhatian pada beberapa genre lain seperti blues elektrik dan folk, dan menggabungkan pengaruh dari gaya muzik jazz, klasik dan muzik lain.',\n",
       "  ' Rock music also drew strongly on a number of other genres such as electric blues and folk, and incorporated influences from jazz, classical and other musical styles.'),\n",
       " ('Masakan Itali ', 'Italian cuisine '),\n",
       " (' saya suka masakan Itali. ', ' i like italian cusine. '),\n",
       " (' Masakan Itali mungkin merupakan ungkapan terpenting dalam diet Mediterranean.',\n",
       "  ' Italian cuisine is probably the most important expression of the Mediterranean diet.'),\n",
       " ('Masakan Itali ', 'Italian cuisine '),\n",
       " (' saya suka masakan Itali. ', ' i like italian cusine. '),\n",
       " (' Masakan Itali telah berkembang selama berabad-abad.',\n",
       "  ' Italian cuisine has developed over the centuries.'),\n",
       " ('Muzik rock ', 'Rock music '),\n",
       " (' muzik rock adalah kegemaran saya. ', ' rock music is my favorite. '),\n",
       " (' Secara muzik, rock berpusat pada gitar elektrik, biasanya sebagai sebahagian daripada kumpulan rock dengan bass elektrik dan drum dan satu atau lebih penyanyi.',\n",
       "  ' Musically, rock has centered on the electric guitar, usually as part of a rock group with electric bass and drums and one or more singers.'),\n",
       " ('Masakan Itali ', 'Italian cuisine '),\n",
       " (' saya suka masakan Itali. ', ' i like italian cusine. '),\n",
       " (' Pasta, sayur-sayuran, minyak zaitun dan ikan adalah bahagian utama dari masakan Itali.',\n",
       "  ' Pasta, vegetables, olive oil and fish are a major part of the Italian cuisine.'),\n",
       " ('Masakan Itali ', 'Italian cuisine '),\n",
       " (' saya suka masakan Itali. ', ' i like italian cusine. '),\n",
       " (' Bahan dan hidangan berbeza mengikut kawasan.',\n",
       "  ' Ingredients and dishes vary by region.'),\n",
       " ('Muzik rock ', 'Rock music '),\n",
       " (' muzik rock adalah kegemaran saya. ', ' rock music is my favorite. '),\n",
       " (' Biasanya, rock adalah muzik berdasarkan lagu biasanya dengan tanda tangan 4/4 menggunakan bentuk ayat-korus, tetapi genrenya menjadi sangat pelbagai.',\n",
       "  ' Typically, rock is song-based music usually with a 4/4 time signature using a verse–chorus form, but the genre has become extremely diverse.'),\n",
       " ('Muzik rock ', 'Rock music '),\n",
       " (' muzik rock adalah kegemaran saya. ', ' rock music is my favorite. '),\n",
       " (' Seperti muzik pop, lirik sering menekankan cinta romantis tetapi juga membahas pelbagai tema lain yang sering bersifat sosial atau politik.',\n",
       "  ' Like pop music, lyrics often stress romantic love but also address a wide variety of other themes that are frequently social or political.'),\n",
       " ('Muzik rock ', 'Rock music '),\n",
       " (' muzik rock adalah kegemaran saya. ', ' rock music is my favorite. '),\n",
       " (' Pada akhir tahun 1960-an \"rock klasik\", sejumlah subgenre muzik rock yang berbeza telah muncul, termasuk hibrida seperti blues rock, folk rock, country rock, raga rock, dan jazz-rock, banyak yang menyumbang kepada pengembangan psychedelic rock, yang dipengaruhi oleh adegan psychedelic dan hippie kontra budaya.',\n",
       "  ' By the late 1960s \"classic rock\" period, a number of distinct rock music subgenres had emerged, including hybrids like blues rock, folk rock, country rock, raga rock, and jazz-rock, many of which contributed to the development of psychedelic rock, which was influenced by the countercultural psychedelic and hippie scene.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Telur rebus dimasak sehingga putih telur dan kuning telur mengeras, sementara telur rebus kuning telur, dan kadang-kadang putih, kekal sekurang-kurangnya sebahagiannya cair.',\n",
       "  ' Hard-boiled eggs are cooked so that the egg white and egg yolk both solidify, while for a soft-boiled egg the yolk, and sometimes the white, remain at least partially liquid.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Telur rebus adalah telur (biasanya telur ayam) dimasak dengan kulitnya yang tidak pecah, biasanya dengan merendam dalam air mendidih.',\n",
       "  ' Boiled eggs are eggs (typically chicken eggs) cooked with their shells unbroken, usually by immersion in boiling water.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Beberapa kaedah yang berbeza digunakan untuk membuat telur rebus selain hanya merendamnya dalam air mendidih.',\n",
       "  ' A few different methods are used to make boiled eggs other than simply immersing them in boiling water.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Variasi ini termasuk: Chef Heston Blumenthal, setelah \"percubaan tanpa henti\", menerbitkan formula untuk \"telur rebus yang sempurna\" yang menjelaskan berapa banyak air yang harus digunakan, berapa banyak masa untuk memasak dan berapa banyak waktu untuk merehatkan telur.',\n",
       "  ' These variations include:  Chef Heston Blumenthal, after \"relentless trials\", published a formula for \"the perfect boiled egg\" that explains how much water to use, how much time to cook and how much time to rest the egg.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Terdapat variasi baik dalam tahap memasak dan dalam cara bagaimana telur direbus, dan berbagai alat dapur untuk telur ada.',\n",
       "  ' There are variations both in degree of cooking and in the method of how eggs are boiled, and a variety of kitchen gadgets for eggs exist.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Telur rebus tidak digalakkan untuk orang yang rentan terhadap salmonella, seperti anak-anak yang sangat muda, orang tua, dan mereka yang mempunyai sistem imun yang lemah.',\n",
       "  ' Soft-boiled eggs are not recommended for people who may be susceptible to salmonella, such as very young children, the elderly, and those with weakened immune systems.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Dalam bahasa Inggeris, Mars membawa nama dewa perang Rom, dan sering disebut sebagai \"Planet Merah\" kerana oksida besi kemerahan yang terdapat di permukaannya memberikan penampilan kemerahan yang khas di antara badan astronomi yang dapat dilihat oleh mata kasar.',\n",
       "  ' In English, Mars carries a name of the Roman god of war, and is often referred to as the \"Red Planet\" because the reddish iron oxide prevalent on its surface gives it a reddish appearance that is distinctive among the astronomical bodies visible to the naked eye.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Pemasa telur diberi nama karena penggunaannya yang biasa dalam menentukan waktu mendidih telur.',\n",
       "  ' The egg timer was named due to its common usage in timing the boiling of eggs.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Telur rebus juga dapat dimasak di bawah suhu mendidih, melalui pengepungan, atau mereka boleh dikukus.',\n",
       "  ' Boiled eggs can also be cooked below the boiling temperature, via coddling, or they can be steamed.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Marikh adalah planet darat dengan atmosfer yang tipis, mempunyai ciri permukaan yang mengingatkan kedua-dua kawah hentaman Bulan dan lembah, gurun, dan lapisan es kutub Bumi.',\n",
       "  ' Mars is a terrestrial planet with a thin atmosphere, having surface features reminiscent both of the impact craters of the Moon and the valleys, deserts, and polar ice caps of Earth.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Marikh adalah planet keempat dari Matahari dan planet terkecil kedua di Sistem Suria selepas Merkuri.',\n",
       "  ' Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System after Mercury.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Mars adalah tapak Olympus Mons, gunung berapi terbesar dan gunung kedua tertinggi yang diketahui di Sistem Suria, dan Valles Marineris, salah satu ngarai terbesar di Sistem Suria.',\n",
       "  ' Mars is the site of Olympus Mons, the largest volcano and second-highest known mountain in the Solar System, and of Valles Marineris, one of the largest canyons in the Solar System.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Masa putaran dan kitaran bermusim Marikh juga serupa dengan Bumi, seperti kemiringan yang menghasilkan musim.',\n",
       "  ' The rotational period and seasonal cycles of Mars are likewise similar to those of Earth, as is the tilt that produces the seasons.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Telur rebus adalah makanan sarapan pagi yang popular di banyak negara di seluruh dunia.',\n",
       "  ' Boiled eggs are a popular breakfast food in many countries around the world.'),\n",
       " ('Telur rebus ', 'Boiled egg '),\n",
       " (' saya suka makan telur rebus. ', ' i like to eat hard boiled eggs. '),\n",
       " (' Untuk mengelakkan masalah salmonella, telur dapat dipasteurisasi dalam cengkerang pada suhu 57 ºC selama satu jam dan 15 minit.',\n",
       "  ' To avoid the issue of salmonella, eggs can be pasteurised in shell at 57 ºC for an hour and 15 minutes.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Lembangan Borealis yang halus di hemisfera utara meliputi 40% planet ini dan mungkin merupakan ciri kesan besar.',\n",
       "  ' The smooth Borealis basin in the northern hemisphere covers 40% of the planet and may be a giant impact feature.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Mars mempunyai dua bulan, Phobos dan Deimos, yang kecil dan berbentuk tidak teratur.',\n",
       "  ' Mars has two moons, Phobos and Deimos, which are small and irregularly shaped.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Ini mungkin ditangkap asteroid, mirip dengan 5261 Eureka, trojan Mars.',\n",
       "  ' These may be captured asteroids, similar to 5261 Eureka, a Mars trojan.'),\n",
       " ('Mars ', 'Mars '),\n",
       " (' saya mahu pergi ke mars. ', ' i want to go to mars. '),\n",
       " (' Terdapat penyelidikan yang sedang dilakukan yang menilai potensi kebiasaan Mars di masa lalu, serta kemungkinan kehidupan yang masih ada.',\n",
       "  ' There are ongoing investigations assessing the past habitability potential of Mars, as well as the possibility of extant life.'),\n",
       " ('Balet ', 'Ballet '),\n",
       " (' ibu saya adalah penari balet. ', ' my mother is a ballet dancer. '),\n",
       " (' Ballet () adalah sejenis tarian persembahan yang berasal dari zaman Renaissance Itali pada abad ke-15 dan kemudian berkembang menjadi bentuk tarian konsert di Perancis dan Rusia.',\n",
       "  ' Ballet () is a type of performance dance that originated during the Italian Renaissance in the 15th century and later developed into a concert dance form in France and Russia.'),\n",
       " ('Balet ', 'Ballet '),\n",
       " (' ibu saya adalah penari balet. ', ' my mother is a ballet dancer. '),\n",
       " (' Sejak itu ia menjadi bentuk tarian yang sangat luas dan sangat teknikal dengan perbendaharaan kata tersendiri berdasarkan terminologi Perancis.',\n",
       "  ' It has since become a widespread, highly technical form of dance with its own vocabulary based on French terminology.'),\n",
       " ('Balet ', 'Ballet '),\n",
       " (' ibu saya adalah penari balet. ', ' my mother is a ballet dancer. '),\n",
       " (' Ballet telah diajar di berbagai sekolah di seluruh dunia, yang secara historis menggabungkan budaya mereka sendiri untuk mengembangkan seni.',\n",
       "  ' Ballet has been taught in various schools around the world, which have historically incorporated their own cultures to evolve the art.'),\n",
       " ('Balet ', 'Ballet '),\n",
       " (' ibu saya adalah penari balet. ', ' my mother is a ballet dancer. '),\n",
       " (' Ini berpengaruh secara global dan telah menentukan teknik asas yang digunakan dalam banyak genre / budaya tarian lain.',\n",
       "  ' It has been globally influential and has defined the foundational techniques used in many other dance genres/cultures.'),\n",
       " ('Balet ', 'Ballet '),\n",
       " (' ibu saya adalah penari balet. ', ' my mother is a ballet dancer. ')]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "i = random.randint(0, len(x) - 100)\n",
    "list(zip(x[i: i + 100], y[i: i + 100]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('en-ms/texts.json.translate') as fopen:\n",
    "    news = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(news)):\n",
    "    l = news[i][0]\n",
    "    r = news[i][1]\n",
    "    l_len = len(l.split()) / 2 < len(r.split())\n",
    "    r_len = len(r.split()) / 2 < len(l.split())\n",
    "    if l != r and l_len and r_len:\n",
    "        X.append(news[i][1])\n",
    "        Y.append(news[i][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in glob('en-ms/dataset-*.json.translate')[:6]:\n",
    "    with open(file) as fopen:\n",
    "        news = json.load(fopen)\n",
    "        \n",
    "    for i in range(len(news)):\n",
    "        l = news[i][0]\n",
    "        r = news[i][1]\n",
    "        l_len = len(l.split()) / 2 < len(r.split())\n",
    "        r_len = len(r.split()) / 2 < len(l.split())\n",
    "        if l != r and l_len and r_len:\n",
    "            X.append(news[i][1])\n",
    "            Y.append(news[i][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5770106/5770106 [02:17<00:00, 42028.11it/s]\n"
     ]
    }
   ],
   "source": [
    "filtered_X, filtered_Y = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    X[i] = cleaning(X[i])\n",
    "    Y[i] = cleaning(Y[i])\n",
    "    if len(X[i]) and len(Y[i]):\n",
    "        filtered_X.append(X[i])\n",
    "        filtered_Y.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5769965/5769965 [00:02<00:00, 2042231.91it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "4.5742565162873605"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count, ids = 0, []\n",
    "for i in tqdm(range(len(filtered_X))):\n",
    "    if filtered_X[i] == filtered_Y[i]:\n",
    "        count += 1\n",
    "        ids.append(i)\n",
    "        \n",
    "count / len(filtered_X) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5769965/5769965 [00:07<00:00, 754717.13it/s] \n"
     ]
    }
   ],
   "source": [
    "uniques = set()\n",
    "for i in tqdm(range(len(filtered_X))):\n",
    "    s = f'{filtered_X[i]} [EENNDD] {filtered_Y[i]}'\n",
    "    uniques.add(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4004364"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniques = list(uniques)\n",
    "len(uniques)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4004364/4004364 [03:04<00:00, 21648.58it/s]\n"
     ]
    }
   ],
   "source": [
    "X, Y = [], []\n",
    "for i in tqdm(range(len(uniques))):\n",
    "    x, y = uniques[i].split(' [EENNDD] ')\n",
    "    xc = check(x)\n",
    "    yc = check(y)\n",
    "    xc_len = len(xc.split()) / 2 > len(yc)\n",
    "    yc_len = len(yc.split()) / 2 > len(xc)\n",
    "    if xc == yc or xc_len or yc_len:\n",
    "        continue\n",
    "    X.append(x)\n",
    "    Y.append(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3884322/3884322 [00:17<00:00, 223001.73it/s]\n"
     ]
    }
   ],
   "source": [
    "X_len, Y_len = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    X_len.append(len(X[i].split()))\n",
    "    Y_len.append(len(Y[i].split()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(759, 1823)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "max(X_len), max(Y_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1258436, 1298173, 3707655, 1288348, 3563209, 3673957, 1621546,\n",
       "         82810,  105993, 1254225, 2900874, 2349546, 2847171, 3860956,\n",
       "       1253550, 3579518, 3775526,  736604, 1860351, 1319796,  801434,\n",
       "       2100682, 3791157, 2007760, 2727713, 1254139,  316272, 3274706,\n",
       "       1829159, 3364107,  218956,  876776, 2335361, 2086756, 3003057,\n",
       "       3793888, 1269982, 1624684,  723124, 1321969, 2628889, 3095791,\n",
       "       1209735,  282673, 3042944, 3804830, 3197883, 2726497, 2673245,\n",
       "       3527848, 2325913, 2234196, 2162455, 1778472,  567554,   94254,\n",
       "       2650807,  538274, 2522759, 1914661, 2759808,  456546, 3018993,\n",
       "       2285896, 2194293, 1430077, 2126997, 3566415, 1528865, 2605086,\n",
       "       2404427, 2964754, 1489189,  944415, 1803209, 3285211, 3745738,\n",
       "       3676843, 1085759, 3451236, 3098539, 1723657, 2544830,  133586,\n",
       "        994771, 1168980,  325442, 1739860, 3216426, 2328822,  451184,\n",
       "       3852511, 3746859,  771027, 2280267, 1984742, 3515696,  897578,\n",
       "        198255, 2065592])"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "np.argsort(X_len)[::-1][:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Di Jogja, sambil bercuti, Karmen dan Milly melihat Rangga di jalan raya, dan mereka memberitahu Cinta mengenainya.',\n",
       "  'In Jogja, on vacation, Karmen and Milly see Sky on the road, and they tell Love about it.'),\n",
       " ('\"SPRM ingin menegaskan bahawa siasatan yang dijalankan oleh SPRM tidak menghalang atau menyekat perolehan atau pembelian oleh JBPM,\" katanya, hari ini.',\n",
       "  '\"The MACC wants to emphasize that the investigation conducted by the MACC does not hinder or impede the acquisition or purchase by JBPM,\" he said today.'),\n",
       " ('Poskod yang digunakan di Mertajaya adalah 46475.',\n",
       "  'The postal code used in Charging is 46475.'),\n",
       " ('Ia diadakan pada 11 Jun hingga 17 Jun, 2007 di Glasgow, Scotland.',\n",
       "  'It is held from June 11 to June 17, 2007 in Glasgow, Scotland.'),\n",
       " ('\"Bahkan, menerusi komen penonton di platfrom media sosial seperti Instagram (IG) dan Tweeter juga menunjukkan sokongan yang bagus.',\n",
       "  '\"In fact, through the comments of viewers on social media platforms such as Instagram (IG) and Tweeter also showed great support.'),\n",
       " ('Dia pergi walaupun saya suruh dia tunggu',\n",
       "  'He left even though I told him to wait for me.'),\n",
       " ('Jombang, Ciputat Jombang merupakan sebuah desa yang terletak dalam (\"daerah\") kecamatan Ciputat, Kota Tangerang Selatan, Provinsi Banten, Indonesia.',\n",
       "  'J Wave, Ciputat J Wave is a village located in the (\"district\") district of Ciputat, South Tangerang City, Banten Province, Indonesia.'),\n",
       " ('Saya suka berenang! Mendorong sendiri melalui garam atau air tawar adalah yang terbaik. Adakah awak suka berenang?',\n",
       "  'I love to go swimming! Self propelling yourself through salt or fresh water is the best. Do you like to swim?'),\n",
       " ('Terdapat juga kritikan bahawa anggota masyarakat setempat tidak diberi konsultasi yang memadai. Pemerintah Liberal sekarang sedang mengkaji kemungkinan membuka kembali ladang - dimulai dengan dua di wilayah Kingston, dan meminta warga Kanada untuk mempertimbangkan masalah ini melalui tinjauan dalam talian. Balai bandar juga dirancang di Kingston pada tarikh yang belum ditentukan, untuk membolehkan penduduk tempatan dan pihak berkepentingan lain berkongsi pendapat mereka. \"Sekiranya mereka dapat menyelamatkan lelaki lain seperti saya, mereka harus membiarkannya terbuka,\" kata Kincaid mengenai ladang susu di mana dia pernah membersihkan gerai, memerah susu lembu dan membantu anak lembu kelahiran. \"Ini membuat masa saya cepat. Saya bahkan tidak menyedari bahawa saya melakukan masa ketika saya berada di gudang\".',\n",
       "  'There was also criticism that local community members had not been adequately consulted. The Liberal government is now studying the possibility of reopening the farms -- starting with two in the Kingston area, and is asking Canadians to weigh in on the issue through an online survey. A town hall is also planned in Kingston at a yet-to-be-determined date, to allow local residents and other stakeholders to share their thoughts. \"If they could save another guy like me, they should keep that barn open,\" Kincaid said of the dairy farm where he once cleaned stalls, milked cows and helped birth calves. \"It made my time go quick. I didn\\'t even realize I was doing time when I was in the barn\".'),\n",
       " ('Ular bertubuh serdahana hingga agak besar, kekar, mampu mencapai panjang , namun kebanyakan hanya sekitar saja.',\n",
       "  'The snake is medium to large, strong, able to reach long, but mostly just around.')]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(zip(X[-10:], Y[-10:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('en-ms.json', 'w') as fopen:\n",
    "    json.dump({'left': Y, 'right': X}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('english-en-ms.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(Y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('malay-en-ms.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(X))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
